require(quanteda)

# containers for the word-level Wordfish parameters (beta, psi), keyed by
# committee name:
#   feature.list  - one slot per committee, starting as a single NA placeholder
#   beta.psi.list - one list per committee holding 61 yearly slots, all NA
feature.list <- setNames(
  as.list(rep(NA, 12)),
  c("内閣委員会", "総務委員会", "法務委員会", "外交防衛委員会",
    "財務金融委員会", "文部科学委員会", "厚生労働委員会",
    "農林水産委員会", "経済産業委員会", "国土交通委員会",
    "交通・情報通信・国土・環境委員会", "環境委員会")
)
# lapply() carries the committee names over to the nested list
beta.psi.list <- lapply(feature.list, function(placeholder) as.list(rep(NA, 61)))
# read the yearly Wordfish output (file suffix 1958 + i, i.e. 1959-2019) and
# store, per committee and year, a data frame of word parameters; the words seen
# for a committee are accumulated in feature.list
for (i in 1:61) {
  positions <- readRDS(paste0("Wordfish/positions_", 1958 + i, ".rds"))
  # committees are identified by the column names of the positions object
  committee.list <- colnames(positions)
  # NOTE(review): wordfish.list[[j]] is assumed to be the model for
  # committee.list[j] (same order as the columns of positions) - confirm
  wordfish.list <- readRDS(paste0("Wordfish/Wordfish_results_", 1958 + i, ".rds"))
  # seq_along() runs zero times for a year without committees, whereas
  # 1:length() would iterate over c(1, 0)
  for (j in seq_along(committee.list)) {
    fit <- wordfish.list[[j]]
    committee <- committee.list[j]
    # skip models whose first theta is NaN (presumably a failed or absent fit)
    if (! is.nan(fit$theta[1])) {
      beta.psi.list[[committee]][[i]] <-
        data.frame(feature = fit$features,
                   beta = fit$beta,
                   psi = fit$psi)
      # running union of the committee's vocabulary across years
      feature.list[[committee]] <-
        unique(c(feature.list[[committee]], fit$features))
    }
  }
}

# strip the leading NA placeholder from each of the 12 committee vocabularies
feature.list[seq_len(12)] <- lapply(
  feature.list[seq_len(12)],
  function(words) words[-1]
)

# merge beta/psi across years and compute time averages by committee
#   merged.beta[[i]], merged.psi[[i]]: word-by-year matrices for committee i,
#     NA where a word has no parameter in that year
#   average.beta[[i]]: named vector with each retained word's mean beta
merged.beta <- merged.psi <- average.beta <- list()
for (i in 1:12) {
  merged.beta[[i]] <- merged.psi[[i]] <- matrix(NA, length(feature.list[[i]]), 61)
  rownames(merged.beta[[i]]) <- feature.list[[i]]
  for (j in 1:61) {
    if (is.data.frame(beta.psi.list[[i]][[j]])) {
      # row positions of this year's words within the committee vocabulary
      rows <- match(beta.psi.list[[i]][[j]]$feature, feature.list[[i]])
      merged.beta[[i]][rows, j] <- beta.psi.list[[i]][[j]]$beta
      merged.psi[[i]][rows, j] <- beta.psi.list[[i]][[j]]$psi
    }
  }
  # keep years with at least one observed word parameter; drop = FALSE keeps the
  # result a matrix even when a single year remains, so rowMeans() still works
  merged.beta[[i]] <- merged.beta[[i]][, colSums(! is.na(merged.beta[[i]])) > 0, drop = FALSE]
  merged.psi[[i]] <- merged.psi[[i]][, colSums(! is.na(merged.psi[[i]])) > 0, drop = FALSE]
  # restrict to relatively frequent words (average psi above the median)
  average.psi <- rowMeans(merged.psi[[i]], na.rm = TRUE)
  merged.beta[[i]] <- merged.beta[[i]][average.psi > quantile(average.psi, 0.5), , drop = FALSE]
  # average beta over time (word contribution); na.rm = TRUE averages over the
  # years in which the word was observed, consistent with the psi average above.
  # Without it, a word missing from any retained year gets an NA average and
  # silently disappears from the ranking.
  average.beta[[i]] <- rowMeans(merged.beta[[i]], na.rm = TRUE)
}

# table of top contributing words per committee: 10 rows for each committee,
# with columns for the most negative / most positive average-beta words and
# their values, all NA until filled in
committee.labels <- c(
  "Cabinet",
  "General Affairs",
  "Judicial Affairs",
  "Foreign Affairs and Defense",
  "Financial Affairs",
  "Education, Culture, and Science",
  "Health, Welfare, and Labor",
  "Agriculture, Forestry, and Fisheries",
  "Economy and Industry",
  "Land and Transport",
  "Trans, Info, Land, and Envir",
  "Environment"
)
important.words <- data.frame(committee = rep(committee.labels, each = 10))
important.words[c("neg.words", "neg.beta", "pos.words", "pos.beta")] <- NA
# fill rows 10 * (i - 1) + 1 .. 10 * i with committee i's 10 most negative and
# 10 most positive average betas (words taken from the vector names, values
# rounded to 2 digits); sort() drops NA averages
for (i in 1:12) {
  rows <- (10 * (i - 1) + 1):(10 * i)
  neg <- head(sort(average.beta[[i]]), 10)
  pos <- head(sort(average.beta[[i]], decreasing = TRUE), 10)
  # a committee with fewer than 10 ranked words would otherwise have its short
  # replacement recycled (silently for 1, 2 or 5 words) or fail for 0 words;
  # pad the remaining rows with NA instead (neg and pos have the same length)
  pad <- rep(NA, 10 - length(neg))
  important.words$neg.words[rows] <- c(names(neg), pad)
  important.words$neg.beta[rows] <- c(round(neg, 2), pad)
  important.words$pos.words[rows] <- c(names(pos), pad)
  important.words$pos.beta[rows] <- c(round(pos, 2), pad)
}

# Table A.3: frequently used words by the left and right camps in each committee
important.words
